import requests
import threading
import glob
import os
import sys
import json
import re


class Spider:
    '''
    Minimal page fetcher / regex scraper built on ``requests`` and ``re``.

    Constructing an instance immediately downloads ``url`` and stores the
    result on ``self.html`` (decoded text, or parsed JSON when ``json`` is
    not None).
    '''

    # Browser-like User-Agent sent with every request made by this class.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}

    def __init__(self,
                 url='https://www.fhxiaoshuo.com/',
                 charset='gbk',
                 json=None,
                 ):
        '''
        Store the settings and fetch ``url`` right away.

        url     -- address of the page to download.
        charset -- encoding assigned to the response before it is decoded.
        json    -- any value other than None makes the fetch return the
                   parsed JSON body instead of the page text.
        '''
        self.url = url
        self.json = json
        self.charset = charset
        # NOTE: this rebinds the name ``html`` on the instance, so after
        # construction ``self.html`` is the fetched data and no longer the
        # bound method below. Callers read it as an attribute.
        self.html = self.html()

    def _get(self):
        '''
        Issue the GET request for ``self.url`` and return the response with
        its encoding set to ``self.charset``.

        Shared by ``html`` and ``content`` so both send the same headers and
        TLS settings.
        '''
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept because the original page fetch relied on it.
        res = requests.get(self.url, headers=self.headers, verify=False)
        res.encoding = self.charset
        return res

    def html(self):
        '''
        Return the page source (or the parsed JSON body when ``self.json``
        is not None). Prints the URL being fetched.
        '''
        print(self.url)
        res = self._get()
        if self.json is not None:
            return res.json()
        return res.text

    def info(self, **regex):
        '''
        Run every given pattern against the stored page source.

        Each keyword argument is ``name=pattern``; the result maps each name
        to the list returned by ``re.findall(pattern, self.html, re.S)``.
        '''
        return {
            key: re.findall(pattern, self.html, re.S)
            for key, pattern in regex.items()
        }

    @staticmethod
    def _parse_content(page_source, kongge, br, regular):
        '''
        Extract the first match of ``regular`` from ``page_source``, remove
        every ``kongge`` and turn every ``br`` into a newline.

        Raises IndexError when the pattern matches nothing.
        '''
        matches = re.findall(regular, page_source, re.S)
        if not matches:
            # Same exception type the bare ``[0]`` lookup used to raise, but
            # with a message that says what actually went wrong.
            raise IndexError(
                'content pattern %r did not match the page' % (regular,))
        return matches[0].replace(kongge, '').replace(br, '\n')

    def content(self, kongge='&nbsp;', br='<br />',
                regular='<dd id="contents">(.*?)</dd>'):
        '''
        Download ``self.url`` again and return the chapter text.

        kongge  -- substring removed from the extracted text.
        br      -- substring replaced by a newline in the extracted text.
        regular -- pattern whose first match is the chapter body.

        Raises IndexError when ``regular`` matches nothing in the page.
        '''
        chapter_html = self._get().text
        return self._parse_content(chapter_html, kongge, br, regular)  # chapter content
        
    
if __name__ == '__main__':
    # Manual smoke run: build a spider with its default arguments and show
    # what the constructor stored on ``html`` (read as an attribute, not
    # called).
    page_source = Spider().html
    print(page_source)













    
